Inspect & Import data
R tries to import the first sheet of the excel file which resolves in an error. This is why the argument read_excel function has to be used to specify the column.
# Inspect sheets of excel-file
excel_sheets('Spreadsheet_Data.xls')
## [1] "DoubleClick" "Copyright" "Kayak"
# Import data
kayak <- read_excel("Spreadsheet_Data.xls",
sheet = "Kayak")
doubleclick <- read_excel("Spreadsheet_Data.xls",
sheet = "DoubleClick")
Massaging
#Convert to dataframe
doubleclick <- as.data.frame(doubleclick)
# Get a big picture understanding of the data
summary(doubleclick)
## Publisher ID Publisher Name Keyword ID Keyword
## Length:4510 Length:4510 Length:4510 Length:4510
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Match Type Campaign Keyword Group Category
## Length:4510 Length:4510 Length:4510 Length:4510
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Bid Strategy Keyword Type Status Search Engine Bid
## Length:4510 Length:4510 Length:4510 Min. : 0.000
## Class :character Class :character Class :character 1st Qu.: 3.384
## Mode :character Mode :character Mode :character Median : 6.250
## Mean : 5.435
## 3rd Qu.: 6.250
## Max. :27.500
## Clicks Click Charges Avg. Cost per Click Impressions
## Min. : 0.0 Min. : 0.00 Min. : 0.000 Min. : 0
## 1st Qu.: 1.0 1st Qu.: 2.31 1st Qu.: 0.825 1st Qu.: 28
## Median : 4.0 Median : 6.76 Median : 1.650 Median : 176
## Mean : 113.7 Mean : 167.48 Mean : 1.890 Mean : 9284
## 3rd Qu.: 19.0 3rd Qu.: 28.49 3rd Qu.: 2.663 3rd Qu.: 844
## Max. :34012.0 Max. :46188.44 Max. :10.000 Max. :8342415
## Engine Click Thru % Avg. Pos. Trans. Conv. % Total Cost/ Trans.
## Min. : 0.000 Min. : 0.000 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 1.532 1st Qu.: 1.143 1st Qu.: 0.0000 1st Qu.: 0.00
## Median : 4.106 Median : 1.594 Median : 0.0000 Median : 0.00
## Mean : 11.141 Mean : 1.930 Mean : 0.5693 Mean : 27.61
## 3rd Qu.: 10.917 3rd Qu.: 2.308 3rd Qu.: 0.0000 3rd Qu.: 0.00
## Max. :200.000 Max. :15.000 Max. :900.0000 Max. :9597.17
## Amount Total Cost Total Volume of Bookings
## Min. : 0 Min. : 0.00 Min. : 0.0000
## 1st Qu.: 0 1st Qu.: 2.31 1st Qu.: 0.0000
## Median : 0 Median : 6.76 Median : 0.0000
## Mean : 1034 Mean : 167.48 Mean : 0.8734
## 3rd Qu.: 0 3rd Qu.: 28.49 3rd Qu.: 0.0000
## Max. :567463 Max. :46188.44 Max. :439.0000
str(doubleclick)
## 'data.frame': 4510 obs. of 23 variables:
## $ Publisher ID : chr "K2615" "K2615" "K2003" "K1175" ...
## $ Publisher Name : chr "Yahoo - US" "Yahoo - US" "MSN - Global" "Google - Global" ...
## $ Keyword ID : chr "43000000039657988" "43000000039651113" "43000000019452431" "43000000005663331" ...
## $ Keyword : chr "fly to florence" "low international airfare" "air discount france ticket" "[airfrance]" ...
## $ Match Type : chr "Advanced" "Advanced" "Broad" "Exact" ...
## $ Campaign : chr "Western Europe Destinations" "Geo Targeted DC" "Air France Brand & French Destinations" "Air France Global Campaign" ...
## $ Keyword Group : chr "Florence" "Low International DC" "France" "Air France" ...
## $ Category : chr "uncategorized" "uncategorized" "uncategorized" "airfrance" ...
## $ Bid Strategy : chr NA NA "Position 2-5 Bid Strategy" "Position 1- 3" ...
## $ Keyword Type : chr "Unassigned" "Unassigned" "Unassigned" "Unassigned" ...
## $ Status : chr "Live" "Paused" "Deactivated" "Unavailable" ...
## $ Search Engine Bid : num 6.25 6.25 0 7.5 0.25 0.125 6.25 0.325 7.5 5 ...
## $ Clicks : num 1 1 1 59 8 42 3 47 13 19 ...
## $ Click Charges : num 2.312 0.625 0.388 2.312 2.2 ...
## $ Avg. Cost per Click : num 2.3125 0.625 0.3875 0.0392 0.275 ...
## $ Impressions : num 11 6 9 401 318 722 13 547 448 129 ...
## $ Engine Click Thru % : num 9.09 16.67 11.11 14.71 2.52 ...
## $ Avg. Pos. : num 1.27 1 1.11 2 2.98 ...
## $ Trans. Conv. % : num 900 100 100 3.39 12.5 ...
## $ Total Cost/ Trans. : num 0.257 0.625 0.388 1.156 2.2 ...
## $ Amount : num 8778 1574 390 1665 935 ...
## $ Total Cost : num 2.312 0.625 0.388 2.312 2.2 ...
## $ Total Volume of Bookings: num 9 1 1 2 1 2 1 2 1 1 ...
#Look for weird stuff
table(doubleclick$`Match Type`)
##
## Advanced Broad Exact N/A Standard
## 969 2591 22 48 880
# The NAs have to be removed.
doubleclick_clean <- na.omit(doubleclick)
# Notice how the number of rows gets reduced
print(nrow(doubleclick_clean))
## [1] 3286
# Look for Spelling mistakes
unique(doubleclick_clean $`Bid Strategy`)
## [1] "Position 2-5 Bid Strategy" "Position 1- 3"
## [3] "Position 1-2 Target" "Position 5-10 Bid Strategy"
## [5] "Position 1-4 Bid Strategy" "Position 1 -2 Target"
## [7] "Postiion 1-4 Bid Strategy" "Pos 3-6"
# Replace Typos
doubleclick_clean$`Bid Strategy` <- gsub("Postiion 1-4 Bid Strategy","Position 1-4 Bid Strategy",doubleclick_clean$`Bid Strategy`)
doubleclick_clean$`Bid Strategy` <- gsub("Position 1 -2 Target","Position 1-2 Target",doubleclick_clean$`Bid Strategy`)
# Create data set for analysis
sem <- doubleclick_clean[,c('Campaign','Keyword','Keyword Group','Publisher Name', 'Bid Strategy','Engine Click Thru %','Match Type','Trans. Conv. %','Total Cost/ Trans.')]
Descriptive
# Find out publishers
table(sem$`Publisher Name`)
##
## Google - Global Google - US MSN - Global Overture - Global
## 264 2003 99 271
## Overture - US Yahoo - US
## 648 1
# Visual Way
plot_ly(x = sem$'Publisher Name', type = "histogram")
# Average out the clickthroughs per publisher
clickthrough_publisher <- aggregate(sem$`Engine Click Thru %`, by=list(sem$`Publisher Name`), FUN=mean)
# Visualize average clickthroughs per publisher
plot_ly(clickthrough_publisher, x = clickthrough_publisher$`Group.1`, y=~`x`)
## No trace type specified:
## Based on info supplied, a 'bar' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#bar
# Average out the costs per click per publisher
costs_publisher <- aggregate(sem$`Total Cost/ Trans.`, by=list(sem$`Publisher Name`), FUN=mean)
# Visualize average clickthroughs per engine
plot_ly(costs_publisher, x = costs_publisher$`Group.1`, y=~`x`)
## No trace type specified:
## Based on info supplied, a 'bar' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#bar
It seems like Google-US has the highest clickthrough rate and the costs / click are unusually high for Yahoo - US.
# Visualize impressions per campaign
plot_ly(doubleclick_clean, x = doubleclick_clean$`Campaign`, y=~Impressions, type='bar')
# Select observations with the highest total cost per transaction
sem_sub <- subset(sem,subset = `Total Cost/ Trans.` > 0)
# Visualize the costs per transactions for different Publisher
p <- plot_ly(sem_sub, y = ~`Total Cost/ Trans.`, color = I("black"),
alpha = 0.2, boxpoints = "suspectedoutliers")
p1 <- p %>% add_boxplot(x = ~`Publisher Name`)
p1
# Visualize the converted transactions for different bid strategies
convert_bid <- plot_ly(sem_sub, y = ~`Trans. Conv. %`, color = I("black"),
alpha = 0.2, boxpoints = "suspectedoutliers")
p2 <- p %>% add_boxplot(x = ~`Bid Strategy`)
p2
# Visualize the numerical variables in 3D-Space
plot_ly(sem, x = ~`Engine Click Thru %`, y = ~`Trans. Conv. %`, z =~`Total Cost/ Trans.`) %>%
add_markers(color = ~`Trans. Conv. %`)
Keywords
ggplot(data=doubleclick_clean, aes(x=doubleclick_clean$`Clicks`, y=doubleclick_clean$`Impressions`, color=doubleclick_clean$`Match Type`)) + geom_point()

Volume of Bookings
bookings_campaigns <- aggregate(doubleclick_clean$`Total Volume of Bookings`, by=list(doubleclick_clean$Campaign), FUN=sum)
plot_ly(bookings_campaigns, x=~Group.1, y=~x)
## No trace type specified:
## Based on info supplied, a 'bar' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#bar
Most of the impressions got generated by unassigned keywords!